Load packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(skimr)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step

Read in the data from the CSV file (download it to the project folder first)

# Load the airline satisfaction survey from the project folder, then show
# a preview of the resulting tibble.
airsat <- read_csv(file = "airsatisfaction.csv")
print(airsat)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   satisfaction = col_character(),
##   sex = col_character(),
##   customer_type = col_character(),
##   travel_type = col_character(),
##   class = col_character()
## )
## i Use `spec()` for the full column specifications.
## # A tibble: 10,000 x 23
##    satisfaction sex    customer_type   age travel_type class    flight_distance
##    <chr>        <chr>  <chr>         <dbl> <chr>       <chr>              <dbl>
##  1 dissatisfied Female loyal            39 Business    Business            2620
##  2 dissatisfied Female loyal            66 Business    Business            2364
##  3 satisfied    Female loyal             8 Personal    Eco                 1828
##  4 dissatisfied Male   loyal            43 Business    Eco                 3564
##  5 dissatisfied Female disloyal         26 Business    Eco                 2040
##  6 dissatisfied Male   loyal            29 Personal    Eco Plus            2439
##  7 satisfied    Female loyal            44 Business    Business             858
##  8 dissatisfied Male   disloyal         39 Business    Business            1610
##  9 dissatisfied Male   loyal            65 Personal    Business             691
## 10 satisfied    Female loyal            40 Business    Business            2889
## # ... with 9,990 more rows, and 16 more variables: seat_comfort <dbl>,
## #   time_convenience <dbl>, food_drink <dbl>, gate_location <dbl>,
## #   inflight_wifi <dbl>, inflight_entertainment <dbl>, online_support <dbl>,
## #   ease_booking <dbl>, onboard_service <dbl>, leg_room <dbl>,
## #   baggage_handling <dbl>, checkin_service <dbl>, cleanliness <dbl>,
## #   online_boarding <dbl>, departure_delay <dbl>, arrival_delay <dbl>

Create training and testing sets

# Fix the RNG state so the random split below is reproducible.
set.seed(2021)

# Row positions of the training set: 80% of the rows, drawn with respect to
# the `satisfaction` outcome. `list = FALSE` asks caret for a matrix of row
# positions rather than a list.
index <- createDataPartition(
  y = airsat[["satisfaction"]],
  p = 0.8,
  list = FALSE
)

# Rows named in `index` form the training set; all other rows form the
# testing set.
airsat_train <- airsat[index, ]
airsat_test <- airsat[-index, ]

# Display both subsets.
airsat_train
airsat_test

Explore the training set

# Summarise every column of the training set (type, missingness and
# distribution) before deciding on preprocessing steps.
airsat_train %>%
  skim()
Data summary
Name airsat_train
Number of rows 8001
Number of columns 23
_______________________
Column type frequency:
character 5
numeric 18
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
satisfaction 0 1 9 12 0 2 0
sex 0 1 4 6 0 2 0
customer_type 0 1 5 8 0 2 0
travel_type 0 1 8 8 0 2 0
class 0 1 3 8 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 39.52 14.95 7 27 40 51 85 ▃▇▇▅▁
flight_distance 0 1 1987.42 1037.12 50 1364 1918 2559 6595 ▃▇▃▁▁
seat_comfort 0 1 2.88 1.39 0 2 3 4 5 ▇▇▇▇▅
time_convenience 0 1 2.99 1.52 0 2 3 4 5 ▇▆▆▇▇
food_drink 0 1 2.90 1.43 0 2 3 4 5 ▇▇▇▇▆
gate_location 0 1 3.00 1.31 1 2 3 4 5 ▅▆▇▇▅
inflight_wifi 0 1 3.25 1.32 0 2 3 4 5 ▃▇▇▇▇
inflight_entertainment 0 1 3.42 1.33 0 3 4 4 5 ▂▃▅▇▆
online_support 0 1 3.53 1.31 1 3 4 5 5 ▃▃▅▇▇
ease_booking 0 1 3.48 1.31 1 2 4 5 5 ▃▅▅▇▇
onboard_service 0 1 3.48 1.27 1 3 4 4 5 ▂▃▅▇▆
leg_room 0 1 3.48 1.30 0 2 4 5 5 ▂▅▅▇▇
baggage_handling 0 1 3.70 1.16 1 3 4 5 5 ▂▂▃▇▆
checkin_service 0 1 3.34 1.27 1 3 3 4 5 ▃▃▇▇▆
cleanliness 0 1 3.70 1.15 1 3 4 5 5 ▁▂▃▇▆
online_boarding 0 1 3.35 1.31 1 2 4 4 5 ▃▅▇▇▇
departure_delay 0 1 14.32 36.88 0 0 0 12 569 ▇▁▁▁▁
arrival_delay 34 1 14.56 37.15 0 0 0 13 543 ▇▁▁▁▁

Create and prep recipe

# Preprocessing recipe for predicting `satisfaction` from all other columns.
# The recipe is declared on the training set rather than the full `airsat`
# data, so the held-out test rows play no part in defining or estimating it.
airsat_recipe <-
  recipe(satisfaction ~ ., data = airsat_train) %>%
  # Drop predictors with near-zero variance.
  step_nzv(all_predictors()) %>%
  # Drop numeric predictors that are exact linear combinations of others.
  step_lincomb(all_numeric_predictors()) %>%
  # Center and scale the numeric predictors (needed before PCA).
  step_normalize(all_numeric_predictors()) %>%
  # Replace the numeric predictors with the principal components that
  # together explain at least 90% of their variance.
  step_pca(all_numeric_predictors(), threshold = 0.9) %>%
  # Convert the nominal predictors to dummy-coded indicator columns.
  step_dummy(all_nominal_predictors()) %>%
  # Estimate every step from the training rows and log what each step changed.
  prep(training = airsat_train, log_changes = TRUE)
## step_nzv (nzv_AY9ua): 
##  removed (2): departure_delay, arrival_delay
## 
## step_lincomb (lincomb_AySWE): same number of columns
## 
## step_normalize (normalize_V6vGz): same number of columns
## 
## step_pca (pca_wPZrK): 
##  new (11): PC01, PC02, PC03, PC04, PC05, PC06, PC07, PC08, PC09, PC10, ...
##  removed (16): age, flight_distance, seat_comfort, time_convenience, ...
## 
## step_dummy (dummy_5NO9d): 
##  new (5): sex_Male, customer_type_loyal, travel_type_Personal, class_Eco, ...
##  removed (4): sex, customer_type, travel_type, class

Bake new training set

# Apply the prepped recipe to the training rows, then display the result.
airsat_baked_train <- airsat_recipe %>%
  bake(new_data = airsat_train)
print(airsat_baked_train)
## # A tibble: 8,001 x 17
##    satisfaction   PC01   PC02    PC03   PC04   PC05    PC06   PC07    PC08
##    <fct>         <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl>   <dbl>
##  1 dissatisfied -1.15  -0.759  0.759   0.808  1.46   0.537   0.645 -0.195 
##  2 satisfied     2.61  -0.466 -1.72    1.11  -1.17   0.169  -1.55   0.645 
##  3 dissatisfied -4.02  -1.50  -0.578   0.225 -0.182  1.48   -0.260  0.269 
##  4 dissatisfied -2.49   1.17  -0.922   0.771 -0.372 -2.31   -0.188 -0.537 
##  5 satisfied     1.30   0.121 -0.0532 -1.16  -0.598  0.0442 -0.499  0.119 
##  6 satisfied     1.95   1.68  -0.982   0.529  0.247  0.596   0.930 -0.0723
##  7 satisfied     1.03  -1.64   3.46    0.144  0.496  1.96   -0.750 -0.561 
##  8 satisfied     4.07  -1.69  -0.354  -1.20   1.16  -0.463  -0.101  0.214 
##  9 dissatisfied -1.87  -0.569  0.125   0.707  1.03   1.01    1.00  -0.220 
## 10 dissatisfied  0.830  2.56   1.18   -0.811  1.21  -2.19    0.729  0.635 
## # ... with 7,991 more rows, and 8 more variables: PC09 <dbl>, PC10 <dbl>,
## #   PC11 <dbl>, sex_Male <dbl>, customer_type_loyal <dbl>,
## #   travel_type_Personal <dbl>, class_Eco <dbl>, class_Eco.Plus <dbl>

Bake new testing set

# Apply the same prepped recipe (estimated on the training set) to the
# testing rows, then display the result.
airsat_baked_test <- airsat_recipe %>%
  bake(new_data = airsat_test)
print(airsat_baked_test)
## # A tibble: 1,999 x 17
##    satisfaction    PC01   PC02   PC03   PC04   PC05    PC06   PC07   PC08   PC09
##    <fct>          <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
##  1 dissatisfied -0.0375 -1.71   1.42  -0.734  1.23   0.477   1.80  -0.529 -0.289
##  2 dissatisfied -1.34   -1.98   1.02   1.09  -0.159 -0.547   1.00  -1.16   0.424
##  3 dissatisfied -0.611   1.80  -0.977 -0.406 -0.620 -0.563  -0.337  1.14   1.01 
##  4 dissatisfied  3.14   -0.204 -0.823 -2.00  -0.308  0.439  -0.207  0.263  1.22 
##  5 satisfied    -3.41    1.42   0.348 -0.648 -1.16   0.235   0.494  2.17   0.462
##  6 satisfied     2.03   -2.01  -1.96   1.39   1.85   0.614  -0.593  0.450 -0.445
##  7 dissatisfied -1.36    1.71   0.360 -0.676 -0.705  0.0296 -1.06  -0.817  0.580
##  8 satisfied     0.107   1.29   1.89   0.600 -1.01  -0.422   1.75  -0.162  0.283
##  9 dissatisfied  2.34   -0.535  0.321  2.65   1.40  -0.711  -0.271  0.391 -0.561
## 10 satisfied    -0.368  -2.88  -2.23  -1.59   0.741 -1.10   -0.538 -0.646  1.41 
## # ... with 1,989 more rows, and 7 more variables: PC10 <dbl>, PC11 <dbl>,
## #   sex_Male <dbl>, customer_type_loyal <dbl>, travel_type_Personal <dbl>,
## #   class_Eco <dbl>, class_Eco.Plus <dbl>

Hands-on Activity

Modify the code above to accomplish the following goals:

  1. Use 75% of the data for training and 25% of the data for testing.

  2. Apply the Yeo-Johnson transformation to the flight_distance variable (before normalizing it).

  3. Instead of using PCA to address multicollinearity, drop highly correlated predictors.

  4. Use one-hot encoding for the nominal predictors instead of dummy codes.

Answer key

# Answer key for the hands-on activity: re-split the data and rebuild the
# recipe with the four requested modifications.

# Fix the RNG state so the random split below is reproducible.
set.seed(2021)

# Goal 1: use 75% of the rows for training and the remaining 25% for testing.
index <- createDataPartition(airsat$satisfaction, p = 0.75, list = FALSE)
airsat_train <- airsat[index, ]
airsat_test <- airsat[-index, ]

# Display both subsets.
airsat_train
airsat_test

# The recipe is declared on the training set rather than the full `airsat`
# data, so the held-out test rows play no part in defining or estimating it.
airsat_recipe <-
  recipe(satisfaction ~ ., data = airsat_train) %>%
  # Drop predictors with near-zero variance.
  step_nzv(all_predictors()) %>%
  # Drop numeric predictors that are exact linear combinations of others.
  step_lincomb(all_numeric_predictors()) %>%
  # Goal 2: Yeo-Johnson transform flight_distance before it is normalized.
  step_YeoJohnson(flight_distance) %>%
  # Center and scale the numeric predictors.
  step_normalize(all_numeric_predictors()) %>%
  # Goal 3: drop highly correlated numeric predictors instead of using PCA.
  step_corr(all_numeric_predictors()) %>%
  # Goal 4: one-hot encode the nominal predictors (one column per level).
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Estimate every step from the training rows and log what each step changed.
  prep(training = airsat_train, log_changes = TRUE)
## step_nzv (nzv_fnnfs): 
##  removed (2): departure_delay, arrival_delay
## 
## step_lincomb (lincomb_s6zuF): same number of columns
## 
## step_YeoJohnson (YeoJohnson_gMFXy): same number of columns
## 
## step_normalize (normalize_7QqN7): same number of columns
## 
## step_corr (corr_NMHen): same number of columns
## 
## step_dummy (dummy_b1gwO): 
##  new (9): sex_Female, sex_Male, customer_type_disloyal, ...
##  removed (4): sex, customer_type, travel_type, class